{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6ef57eb5-2295-44fb-81d9-4351769e8f4e",
   "metadata": {},
   "source": [
    "# L2: Normalizing the Content"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1138a09c-e3b2-46e0-9d42-e46a93ac534f",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec2430d-cac6-406e-a007-123dc97df0e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Warning control\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "591f213d-f54b-4c91-ae73-bcd2c7a4359e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import JSON\n",
    "\n",
    "import json\n",
    "\n",
    "from unstructured_client import UnstructuredClient\n",
    "from unstructured_client.models import shared\n",
    "from unstructured_client.models.errors import SDKError\n",
    "\n",
    "from unstructured.partition.html import partition_html\n",
    "from unstructured.partition.pptx import partition_pptx\n",
    "from unstructured.staging.base import dict_to_elements, elements_to_json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b97be736-2166-400c-9d17-6dcb4034c9eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Utils import Utils\n",
    "utils = Utils()\n",
    "\n",
    "DLAI_API_KEY = utils.get_dlai_api_key()\n",
    "DLAI_API_URL = utils.get_dlai_url()\n",
    "\n",
    "s = UnstructuredClient(\n",
    "    api_key_auth=DLAI_API_KEY,\n",
    "    server_url=DLAI_API_URL,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be8a0cbd-1545-4dde-ab60-84dcdaa4a51a",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Access Utils File and Helper Functions:</b> To access helper functions and other related files for this notebook, 1) click on the <em>\"View\"</em> option on the top menu of the notebook and then 2) click on <em>\"File Browser\"</em>. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee01e76f-3773-4611-9ad8-5f990ab23eef",
   "metadata": {},
   "source": [
    "## Example Document: Medium Blog HTML Page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8753e8a4-859c-41eb-8b5e-101a30ed5b2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import Image\n",
    "Image(filename=\"images/HTML_demo.png\", height=600, width=600)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0265c17e-d9db-4877-a804-74d94596a72e",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/medium_blog.html\"\n",
    "elements = partition_html(filename=filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68f55cc9-0d04-4260-a617-9f3ab0c6951f",
   "metadata": {},
   "outputs": [],
   "source": [
    "element_dict = [el.to_dict() for el in elements]\n",
    "example_output = json.dumps(element_dict[11:15], indent=2)\n",
    "print(example_output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41aa4434-7ff6-45c5-bf7d-ef870358e4bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "JSON(example_output)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e571077-9c60-499c-9668-c0e350148ce8",
   "metadata": {},
   "source": [
    "## Example Doc: MSFT PowerPoint on OpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36692869-ceeb-4eee-88bf-bdbebb198c70",
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(filename=\"images/pptx_slide.png\", height=600, width=600) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60fffa73-2f75-447a-8596-e24cc3288039",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/msft_openai.pptx\"\n",
    "elements = partition_pptx(filename=filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3589bae1-3f4e-4b9e-b16c-8975e37287c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "element_dict = [el.to_dict() for el in elements]\n",
    "JSON(json.dumps(element_dict[:], indent=2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5ecce31-081c-4e51-89ae-06cc8d9eaf2d",
   "metadata": {},
   "source": [
    "## Example Document: PDF on Chain-of-Thought"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "037167ed-21bf-4af3-84a4-21f4b88a0b1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "Image(filename=\"images/cot_paper.png\", height=600, width=600) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcab4d73-f062-47ff-bc43-43b50753be6d",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = \"example_files/CoT.pdf\"\n",
    "with open(filename, \"rb\") as f:\n",
    "    files=shared.Files(\n",
    "        content=f.read(), \n",
    "        file_name=filename,\n",
    "    )\n",
    "\n",
    "req = shared.PartitionParameters(\n",
    "    files=files,\n",
    "    strategy='hi_res',\n",
    "    pdf_infer_table_structure=True,\n",
    "    languages=[\"eng\"],\n",
    ")\n",
    "try:\n",
    "    resp = s.general.partition(req)\n",
    "    print(json.dumps(resp.elements[:3], indent=2))\n",
    "except SDKError as e:\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c44f3c8-cd94-440b-a9cc-01db643f44d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "JSON(json.dumps(resp.elements, indent=2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9e21878-24e1-4967-b5da-c3e0cdf8a69a",
   "metadata": {},
   "source": [
    "## Work With Your Own Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66704bc0-e59e-438e-9fef-2a0a7cffc376",
   "metadata": {},
   "outputs": [],
   "source": [
    "import panel as pn\n",
    "#import param\n",
    "from Utils import upld_file\n",
    "pn.extension()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8b8e99a-4d73-4d24-87a5-55fb8783867c",
   "metadata": {},
   "outputs": [],
   "source": [
    "upld_widget = upld_file()\n",
    "pn.Row(upld_widget.widget_file_upload)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a04e4fa-6b55-4ff0-ad8b-508e81829b60",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px\"> 🖥 &nbsp; <b>Note:</b> If the file upload interface isn't functioning properly, the issue may be related to your browser version. In such a case, please ensure your browser is updated to the latest version, or try using a different browser.</p>\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daaf85f4-f702-467c-b44b-80dcb98aae2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls ./example_files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "691324b6-6291-4668-8be1-fb9044bb5b6d",
   "metadata": {},
   "source": [
    "<p style=\"background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px\"> 💻 &nbsp; <b>Uploading Your Own File - Method 2:</b> To upload your own files, you can also 1) click on the <em>\"View\"</em> option on the top menu of the notebook and then 2) click on <em>\"File Browser\"</em>. Then 3) click on <em>\"Upload\"</em> button to upload your files. For more help, please see the <em>\"Appendix - Tips and Help\"</em> Lesson.</p>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bac37496-46de-4a6c-a2c2-b4749722da64",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba1415fc-e652-4ecc-94ac-0867d101c919",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11073a73-f21d-4e1e-94a7-1f4a57cbc95f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a20d475a-1185-4185-9f57-794e1536e4c9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afd68abd-967a-4ee1-bd12-0309e595cae4",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2eee2322-6829-4d3a-9666-6b6440c7d074",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
